Tidyverse

Author

Davis Rakhshan

1 Challenge 1

Question 1:

Analyze the sales by location (state) with a bar plot. Since state and city are multiple features (variables), they should be split. Which state has the highes revenue?

Code:

# Challenge 1
library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr     1.1.4     ✔ readr     2.1.5
#> ✔ forcats   1.0.0     ✔ stringr   1.5.1
#> ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
#> ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
#> ✔ purrr     1.0.2     
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("readxl")

# Importing Files
bikes_tbl      <- read_excel(path = "../../scripts/01_bike_sales/01_raw_data/bikes.xlsx")
bikeshops_tbl  <- read_excel(path = "../../scripts/01_bike_sales/01_raw_data/bikeshops.xlsx")
orderlines_tbl <- read_excel(path = "../../scripts/01_bike_sales/01_raw_data/orderlines.xlsx")
#> New names:
#> • `` -> `...1`
# Joining Data
left_join(orderlines_tbl, bikes_tbl, by = c("product.id" = "bike.id"))
# Chaining commands with the pipe and assigning it to order_items_joined_tbl
bike_orderlines_joined_tbl <- orderlines_tbl %>%
  left_join(bikes_tbl, by = c("product.id" = "bike.id")) %>%
  left_join(bikeshops_tbl, by = c("customer.id" = "bikeshop.id"))

# Wrangling Data
bike_orderlines_joined_tbl %>% 
  select(category) %>%
  filter(str_detect(category, "^Mountain")) %>% 
  unique()
bike_orderlines_wrangled_tbl <- bike_orderlines_joined_tbl %>%
  separate(col    = category,
           into   = c("category.1", "category.2", "category.3"),
           sep    = " - ") %>%
  mutate(total.price = price * quantity) %>%
  select(-...1, -gender) %>%
  select(-ends_with(".id")) %>%
  bind_cols(bike_orderlines_joined_tbl %>% select(order.id)) %>% 
  select(order.id, contains("order"), contains("model"), contains("category"),
         price, quantity, total.price,
         everything()) %>%
  rename(bikeshop = name) %>%
  set_names(names(.) %>% str_replace_all("\\.", "_"))

# Separate cities and states
bike_orderlines_wrangled_tbl_states <- separate(bike_orderlines_wrangled_tbl,
                                                col = location,
                                                into = c("city", "state"),
                                                sep = ", ",
                                                convert = T)
bike_orderlines_wrangled_tbl_states
sales_by_year_and_state_tbl <- bike_orderlines_wrangled_tbl_states %>%
  select(order_date, total_price, state) %>%
  mutate(year = year(order_date)) %>%
  group_by(state) %>% 
  summarize(sales = sum(total_price)) %>%
  mutate(sales_text = scales::dollar(sales, big.mark = ".", 
                                     decimal.mark = ",", 
                                     prefix = "", 
                                     suffix = " €"))
sales_by_year_and_state_tbl
sales_by_year_and_state_tbl %>%
  ggplot(aes(x = state, y = sales)) +
  
  # Geometries
  geom_col(fill = "#2DC6D6") + # Use geom_col for a bar plot
  geom_label(aes(label = sales_text)) + # Adding labels to the bars
  geom_smooth(method = "lm", se = FALSE) + # Adding a trendline
  
  # Formatting
  # scale_y_continuous(labels = scales::dollar) + # Change the y-axis. 
  # Again, we have to adjust it for euro values
  scale_y_continuous(labels = scales::dollar_format(big.mark = ".", 
                                                    decimal.mark = ",", 
                                                    prefix = "", 
                                                    suffix = " €")) +
  labs(
    title    = "Revenue by state",
    x = "States",
    y = "Revenue"
  ) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
#> `geom_smooth()` using formula = 'y ~ x'

2 Challenge 2

Question 2:

Analyze the sales by location and year (facet_wrap). Because there are 12 states with bike stores, you should get 12 plots.

Code:

# Get state's sales per year
sales_by_state_in_year  <- separate(bike_orderlines_wrangled_tbl,
                                    col = location,
                                    into = c("city", "state"),
                                    sep = ", ",
                                    convert = T)

sales_by_state_in_year
sales_by_state_in_year <- sales_by_state_in_year %>%
  
  # Select columns
  select(order_date, total_price, state) %>%
  
  # Add year column
  mutate(year = year(order_date)) %>%
  
  # Grouping by year and summarizing sales
  group_by(state, year) %>% 
  summarize(sales = sum(total_price)) %>%
  
  # Add a column that turns the numbers into a currency format 
  # (makes it in the plot optically more appealing)
  # mutate(sales_text = scales::dollar(sales)) <- Works for dollar values
  mutate(sales_text = scales::dollar(sales, big.mark = ".", 
                                     decimal.mark = ",", 
                                     prefix = "", 
                                     suffix = " €"))
#> `summarise()` has grouped output by 'state'. You can override using the
#> `.groups` argument.
sales_by_state_in_year
# Plot
salesYearStatePlot <- sales_by_state_in_year  %>%
  # Set up x, y, fill
  ggplot(aes(x = year, y = sales, fill = state)) +
  
  # Geometries
  geom_col() + # Run up to here to get a stacked bar plot
  
  # Facet
  facet_wrap(~ state) +
  
  # Formatting
  scale_y_continuous(labels = scales::dollar_format(big.mark = ".", 
                                                    decimal.mark = ",", 
                                                    prefix = "", 
                                                    suffix = " €")) +
  labs(
    title = "Revenue by year and state",
    subtitle = "Each state is ordered by years 2015 to 2019",
    fill = "States:"
  ) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
salesYearStatePlot